Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.
library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
library(parallel)
library(doParallel)
set.seed(1)
colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
"Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
"Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
"Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
"Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
"Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
"Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
"Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
"Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
"Image_Width","Image_X","Image_Y","Intensity","Length",
"Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
"Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
"Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
"Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")
dd_all <- read_csv("../Class_18C_normalLight/1st_class_2020/Data/libraries_FC100/dd_all.csv", show_col_types = FALSE)
# dd_all$species <- ifelse(dd_all$species=="small_unidentified_possibly_small_digested_algae", "Small_unidentified",dd_all$species)
# table(dd_all$species)
Chlamydomonas <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Chlamydomonas_light_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Cosmarium_light_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Desmodesmus <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Desmodesmus_light_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Monoraphidium_light_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum1_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_normalLight/2nd_class_2022Feb/Data/Staurastrum2_light_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_feb22 <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_feb22) <- colnames
# table(dd_all_feb22$species)
Chlamydomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Chlamydomonas_light_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cosmarium_light_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Cryptomonas_light_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Desmodesmus_light_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Monoraphidium_light_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum1_light_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_normalLight/3rd_data_20220316/Data/Staurastrum2_light_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220316 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220316) <- colnames
# table(dd_all_20220316$species)
Chlamydomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Chlamydomonas_light_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cosmarium_light_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Cryptomonas_light_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Desmodesmus_light_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Monoraphidium_light_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum1_light_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_normalLight/4th_data_20220406/Data/Staurastrum2_light_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220406 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220406) <- colnames
# table(dd_all_20220406$species)
Chlamydomonas <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Chlamydomonas_light_July_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Cosmarium_light_July_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Cryptomonas_light_July_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Desmodesmus_light_July_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Monoraphidium_light_July_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
# Staurastrum1 <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Staurastrum2_light_July_2022.csv", show_col_types = FALSE)
# Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_normalLight/5th_data_20220710/Data/Staurastrum2_light_July_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220710 <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,
# Staurastrum1,
Staurastrum2)
colnames(dd_all_20220710) <- colnames
# table(dd_all_20220710$species)
Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Chlamydomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cosmarium_dark_February_2022_library.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Desmodesmus_dark_February_2022_library.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Monoraphidium_dark_February_2022_library.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum1_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Staurastrum2_dark_February_2022_library.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_18perc/Data/Cryptomonas_dark_February_2022_library.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
dd_all_feb22_dark <- rbind(Chlamydomonas,Cosmarium,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2,
Cryptomonas)
colnames(dd_all_feb22_dark) <- colnames
# table(dd_all_feb22_dark$species)
Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Chlamydomonas_dark_March_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cosmarium_dark_March_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Cryptomonas_dark_March_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Desmodesmus_dark_March_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Monoraphidium_dark_March_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum1_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_6perc/Data/Staurastrum2_dark_March_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220316_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220316_dark) <- colnames
# table(dd_all_20220316_dark$species)
rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)
Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Chlamydomonas_dark_April_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cosmarium_dark_April_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Cryptomonas_dark_April_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Desmodesmus_dark_April_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Monoraphidium_dark_April_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum1_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data/Staurastrum2_dark_April_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220406_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,Staurastrum1,Staurastrum2)
colnames(dd_all_20220406_dark) <- colnames
# table(dd_all_20220406_dark$species)
rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium, Staurastrum1, Staurastrum2)
Chlamydomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Chlamydomonas_dark_July_2022.csv", show_col_types = FALSE)
Chlamydomonas$species <- "Chlamydomonas"
Cosmarium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Cosmarium_dark_July_2022.csv", show_col_types = FALSE)
Cosmarium$species <- "Cosmarium"
Cryptomonas <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Cryptomonas_dark_July_2022.csv", show_col_types = FALSE)
Cryptomonas$species <- "Cryptomonas"
Desmodesmus <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Desmodesmus_dark_July_2022.csv", show_col_types = FALSE)
Desmodesmus$species <- "Desmodesmus"
Monoraphidium <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Monoraphidium_dark_July_2022.csv", show_col_types = FALSE)
Monoraphidium$species <- "Monoraphidium"
# Staurastrum1 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Staurastrum1_dark", show_col_types = FALSE)
# Staurastrum1$species <- "Staurastrum1"
Staurastrum2 <- read_csv("../Class_18C_decreasingLight/Light_1perc/Data20220710/Staurastrum2_dark_July_2022.csv", show_col_types = FALSE)
Staurastrum2$species <- "Staurastrum2"
dd_all_20220710_dark <- rbind(Chlamydomonas,Cosmarium,Cryptomonas,Desmodesmus,
Monoraphidium,
# Staurastrum1,
Staurastrum2)
colnames(dd_all_20220710_dark) <- colnames
# table(dd_all_20220710_dark$species)
rm(Chlamydomonas, Cosmarium, Cryptomonas, Desmodesmus, Monoraphidium,
Staurastrum1,
Staurastrum2)
dd_all$data <- "Late 2020"
dd_all_feb22$data <- "20220201"
dd_all_20220316$data <- "20220316"
dd_all_20220406$data <- "20220406"
dd_all_20220710$data <- "20220710"
dd <- rbind(dd_all,dd_all_feb22,dd_all_20220316,dd_all_20220406, dd_all_20220710)
dd_all_feb22_dark$data <- "20220201"
dd_all_20220316_dark$data <- "20220316"
dd_all_20220406_dark$data <- "20220406"
dd_all_20220710_dark$data <- "20220710"
dd$light <- "30%"
dd_all_feb22_dark$light <- "18%"
dd_all_20220316_dark$light <- "6%"
dd_all_20220406_dark$light <- "1%"
dd_all_20220710_dark$light <- "1%"
dd <- rbind(dd,dd_all_feb22_dark,dd_all_20220316_dark,dd_all_20220406_dark,dd_all_20220710_dark)
# table(dd$species, dd$data, dd$light)
If an individual is an outlier in more than 4 traits it gets excluded (about 6% gets excluded). Outlier based on boxplot definition.
dd$id <- 1:nrow(dd)
dd_long <- dd %>%
dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
Average_Green,Average_Red,Circle_Fit,Circularity,
Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
Symmetry,Transparency,Width, id, data, light) %>%
pivot_longer(cols=-c(id,species, data, light), names_to="variable") %>%
dplyr::group_by(variable,species,data,light) %>%
mutate(iqr = IQR(value),
first_quartile = quantile(value, probs = 0.25),
third_quartile = quantile(value, probs = 0.75),
outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))
outliers <- dd_long %>%
dplyr::filter(outlier==T) %>%
dplyr::group_by(species, id,data,light) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(n>4)
## `summarise()` has grouped output by 'species', 'id', 'data'. You can override using the `.groups` argument.
# table(outliers$species)
# nrow(outliers)/nrow(dd)
dd_filtered <- dd %>%
dplyr::filter(!is.element(id,outliers$id))
dd$removed_outliers <- F
dd_filtered$removed_outliers <- T
dd_comparison <- rbind(dd,dd_filtered)
# dd_comparison %>%
# ggplot(aes(log10(Area_ABD), col=removed_outliers))+
# geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
# geom_boxplot(outlier.alpha = 0.3) +
# theme_bw() +
# facet_wrap(species~interaction(light,data), scales = "free_y") +
# geom_vline(xintercept = 1, col="black")
#
# dd_filtered %>%
# ggplot(aes(log10(Area_ABD)))+
# geom_density(aes(col=species))
We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.
dd <- dd %>%
mutate(data.light = interaction(data,light, drop = T),
data.light.species=interaction(data,light,species, drop = T))
print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##
## airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
## 20220406.1% 0 2035 0 0
## 20220710.1% 0 3449 0 0
## 20220201.18% 0 3023 0 0
## 20220201.30% 0 6004 0 0
## 20220316.30% 0 6463 0 0
## 20220406.30% 0 4391 0 0
## 20220710.30% 0 3240 0 0
## Late 2020.30% 112 1099 359 475
## 20220316.6% 0 4410 0 0
##
## Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
## 20220406.1% 0 0 93 2162 0
## 20220710.1% 0 0 135 8143 0
## 20220201.18% 0 0 176 2987 0
## 20220201.30% 0 0 1103 0 0
## 20220316.30% 0 0 328 3334 0
## 20220406.30% 0 0 429 2901 0
## 20220710.30% 0 0 34 3728 0
## Late 2020.30% 101 264 341 781 339
## 20220316.6% 0 0 164 2239 0
##
## Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
## 20220406.1% 874 0 0 0
## 20220710.1% 1088 0 0 0
## 20220201.18% 1113 0 0 0
## 20220201.30% 5122 0 0 0
## 20220316.30% 4068 0 0 0
## 20220406.30% 3655 0 0 0
## 20220710.30% 1060 0 0 0
## Late 2020.30% 1051 160 154 398
## 20220316.6% 2573 0 0 0
##
## Loxocephallus Monoraphidium OtherCiliate Small_unidentified
## 20220406.1% 0 578 0 0
## 20220710.1% 0 665 0 0
## 20220201.18% 0 1916 0 0
## 20220201.30% 0 746 0 0
## 20220316.30% 0 1062 0 0
## 20220406.30% 0 1026 0 0
## 20220710.30% 0 638 0 0
## Late 2020.30% 224 1489 164 4642
## 20220316.6% 0 855 0 0
##
## Staurastrum1 Staurastrum2 Tetrahymena
## 20220406.1% 212 91 0
## 20220710.1% 0 15 0
## 20220201.18% 111 124 0
## 20220201.30% 589 126 0
## 20220316.30% 244 52 0
## 20220406.30% 604 89 0
## 20220710.30% 0 35 0
## Late 2020.30% 510 75 108
## 20220316.6% 225 94 0
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
## airbubbles Chlamydomonas ChlamydomonasClumps
## 112 34114 359
## Coleps_irchel Colpidium ColpidiumVacuoles
## 475 101 264
## Cosmarium Cryptomonas Debris
## 2803 26275 339
## Desmodesmus Dexiostoma DigestedAlgae
## 20604 160 154
## DividingChlamydomonas Loxocephallus Monoraphidium
## 398 224 8975
## OtherCiliate Small_unidentified Staurastrum1
## 164 4642 2495
## Staurastrum2 Tetrahymena
## 701 108
Clearly, the amount of individuals per class varies a lot… I think the most limiting factor is Staurastrum2, i.e. an algae we are interested in but of which only have rather few individuals. For now at least I try to take 2 * Staurastrum2 as the number of individuals per species to be included (if a species/class has less than that all individuals are included).
Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.
n <- colsums["Staurastrum2"]*3
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=2103"
num_ind_per_class <- dd %>% group_by(species) %>%
summarize(num_class = length(unique(data.light.species)),
num_ind_per_class = floor(n/num_class)) %>%
select(species,num_ind_per_class)
num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species
set.seed(53)
split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
species <- unique(x$species)
x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)
print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##
## airbubbles Chlamydomonas ChlamydomonasClumps Coleps_irchel
## 20220406.1% 0 233 0 0
## 20220710.1% 0 233 0 0
## 20220201.18% 0 233 0 0
## 20220201.30% 0 233 0 0
## 20220316.30% 0 233 0 0
## 20220406.30% 0 233 0 0
## 20220710.30% 0 233 0 0
## Late 2020.30% 112 233 359 475
## 20220316.6% 0 233 0 0
##
## Colpidium ColpidiumVacuoles Cosmarium Cryptomonas Debris
## 20220406.1% 0 0 93 262 0
## 20220710.1% 0 0 135 262 0
## 20220201.18% 0 0 176 262 0
## 20220201.30% 0 0 233 0 0
## 20220316.30% 0 0 233 262 0
## 20220406.30% 0 0 233 262 0
## 20220710.30% 0 0 34 262 0
## Late 2020.30% 101 264 233 262 339
## 20220316.6% 0 0 164 262 0
##
## Desmodesmus Dexiostoma DigestedAlgae DividingChlamydomonas
## 20220406.1% 233 0 0 0
## 20220710.1% 233 0 0 0
## 20220201.18% 233 0 0 0
## 20220201.30% 233 0 0 0
## 20220316.30% 233 0 0 0
## 20220406.30% 233 0 0 0
## 20220710.30% 233 0 0 0
## Late 2020.30% 233 160 154 398
## 20220316.6% 233 0 0 0
##
## Loxocephallus Monoraphidium OtherCiliate Small_unidentified
## 20220406.1% 0 233 0 0
## 20220710.1% 0 233 0 0
## 20220201.18% 0 233 0 0
## 20220201.30% 0 233 0 0
## 20220316.30% 0 233 0 0
## 20220406.30% 0 233 0 0
## 20220710.30% 0 233 0 0
## Late 2020.30% 224 233 164 2103
## 20220316.6% 0 233 0 0
##
## Staurastrum1 Staurastrum2 Tetrahymena
## 20220406.1% 212 91 0
## 20220710.1% 0 15 0
## 20220201.18% 111 124 0
## 20220201.30% 300 126 0
## 20220316.30% 244 52 0
## 20220406.30% 300 89 0
## 20220710.30% 0 35 0
## Late 2020.30% 300 75 108
## 20220316.6% 225 94 0
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
## airbubbles Chlamydomonas ChlamydomonasClumps
## 112 2097 359
## Coleps_irchel Colpidium ColpidiumVacuoles
## 475 101 264
## Cosmarium Cryptomonas Debris
## 1534 2096 339
## Desmodesmus Dexiostoma DigestedAlgae
## 2097 160 154
## DividingChlamydomonas Loxocephallus Monoraphidium
## 398 224 2097
## OtherCiliate Small_unidentified Staurastrum1
## 164 2103 1692
## Staurastrum2 Tetrahymena
## 701 108
trainingdata <- trainingdata[complete.cases(trainingdata),]
# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
p = 0.65, # percentage of data as training
list = FALSE)
testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- comp_name
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
Average_Blue + Average_Green + Average_Red + Circle_Fit +
Circularity + Compactness +
Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
Width
set.seed(8765)
# classifiers_18c <- list()
# classifiers_18c_plots <- list()
# classifiers_18c_assess_plots <- list()
completeList <- mclapply(1:length(compositions_list), function(i){
if("Colpidium" %in% compositions_list[[i]]) {
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
sub_testdata <- testdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
} else {
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
"DigestedAlgae","DividingChlamydomonas"))
sub_testdata <- testdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
"DigestedAlgae","DividingChlamydomonas"))
}
sub_trainingdata$species <- factor(sub_trainingdata$species)
sub_testdata$species <- factor(sub_testdata$species)
# split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
# subsamples <- lapply(split_up, function(x) {
# x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
# sub_trainingdata <- do.call("rbind", subsamples)
# # A stratified random split of the data
# idx_train <- createDataPartition(sub_trainingdata$species,
# p = 0.7, # percentage of data as training
# list = FALSE)
#
# sub_testdata <- sub_trainingdata[-idx_train,]
# sub_trainingdata <- sub_trainingdata[idx_train,]
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
sub_testdata <- sub_testdata %>%
dplyr::mutate(predicted = predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
correct = species==predicted)
sub_testdata_summ <- sub_testdata %>%
group_by(data, species, light, correct) %>%
summarize(n = n()) %>%
dplyr::mutate(perc_corr = n/sum(n),
composition = names(compositions_list)[i]) %>%
dplyr::filter(correct==T)
classified_test_data <- sub_testdata_summ
classifiers_18c_plots <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
classifiers_18c_assess_plots <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
list(classifiers_18c, classified_test_data, classifiers_18c_plots, classifiers_18c_assess_plots)
}, mc.cores = detectCores()-3)
classifiers_18c <- map(completeList, 1)
classified_test_data <- map(completeList, 2)
classifiers_18c_plots <- map(completeList, 3)
classifiers_18c_assess_plots <- map(completeList, 4)
classified_test_data <- do.call("rbind", classified_test_data)
names(classifiers_18c_plots) <- names(classifiers_18c) <-
names(classifiers_18c_assess_plots) <- comp_name
There are mainly 4 measures:
Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)
Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)
Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)
Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)
PPV is the most important for us!
library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] +
plot_layout(widths = c(7,3)))
}
classified_test_data <- classified_test_data%>%
dplyr::mutate(data=ifelse(data=="Late 2020","2020late",data),
tot_n = n/perc_corr, data=factor(data, ordered = T,
levels=c("2020late","20220201","20220301","20220316","20220406","20220706")),
light_num = as.numeric(gsub("%","",light)))
# classified_test_data %>%
# dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
# ggplot(aes(data, perc_corr, col=composition, shape=light, size=tot_n)) +
# geom_hline(yintercept = 0.8, col="red") +
# geom_hline(yintercept = 0.9, col="green")+
# geom_point(position = position_jitter(0.2))+
# facet_wrap(~species, nrow = 3)+
# theme_bw() +
# scale_size_continuous(breaks = c(1,10,20,30)) +
# labs(size="Sample size of evaluation data", y="Correct identification in percentage", x="Date of data collection",
# title="Species identification across date, light intensities and compositions")
classified_test_data %>%
dplyr::filter(species %in% c("Chlamydomonas", "Cosmarium", "Desmodesmus", "Monoraphidium", "Staurastrum1", "Staurastrum2", "Cryptomonas")) %>%
ggplot(aes(light_num, perc_corr))+
geom_hline(yintercept = 0.8, col="red") +
geom_point(aes(fill=composition,size=tot_n),shape=21,col="black",alpha=0.3)+
facet_wrap(~species, nrow = 3)+
scale_size_continuous(breaks = c(1,10,20,30)) +
geom_smooth(method = "lm")+
theme_bw() +
labs(size="Sample size of evaluation data", y="Correct species identification in percentage", x="Light intensity in percentage",
title="Species identification across light intensities and compositions")
## `geom_smooth()` using formula 'y ~ x'
# classified_test_data %>%
# dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
# ggplot(aes(tot_n,perc_corr, col=species))+
# geom_point()+
# theme_bw()
saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_20220710_MergedData.rds")
# labels <- dd_all %>%
# group_by(species) %>%
# summarise(xPos = max(Area_Filled),
# yPos = max((density(Area_Filled))$y))
#
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))